{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/soda/rcappuzz/work/benchmark-join-suggestions\n"
     ]
    }
   ],
   "source": [
    "cd ~/bench"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook is used to combine all results from various runs in to a single file for simplicity and better storage. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "polars.config.Config"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "from pathlib import Path\n",
    "from src.utils.logging import read_and_process, read_logs\n",
    "\n",
    "cfg = pl.Config()\n",
    "cfg.set_fmt_str_lengths(150)\n",
    "cfg.set_tbl_rows(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dest_path = Path(\"results/overall\")\n",
    "overall_list = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wordnet 10k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0428-hde9c7an\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_raw.write_parquet(Path(dest_path, \"wordnet_10k-first.parquet\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wordnet old + Binary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0429-oot6vo5b\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_raw.filter(pl.col(\"target_dl\") == \"binary_update\").write_parquet(\n",
    "#     Path(dest_path, \"binary_update-first.parquet\")\n",
    "# )\n",
    "# df_raw.filter(pl.col(\"target_dl\") == \"wordnet_full\").write_parquet(\n",
    "#     Path(dest_path, \"wordnet_full-first.parquet\")\n",
    "# )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Open Data US"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "r1_path = \"results/logs/0434-7r9ecumo\"\n",
    "df_1 = read_logs(exp_name=None, exp_path=r1_path)\n",
    "df_1 = (\n",
    "    df_1.fill_null(0)\n",
    "    .with_columns(pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\"))\n",
    "    .filter(~pl.col(\"base_table\").str.contains(\"schools\"))\n",
    ")\n",
    "\n",
    "r2_path = \"results/logs/0435-y2ljs95x\"\n",
    "df_2 = read_logs(exp_name=None, exp_path=r2_path)\n",
    "df_2 = (\n",
    "    df_2.fill_null(0)\n",
    "    .with_columns(pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\"))\n",
    "    .filter(~pl.col(\"base_table\").str.contains(\"schools\"))\n",
    ")\n",
    "\n",
    "# Schools\n",
    "r_schools_path = \"results/logs/0444-4ptu03x5\"\n",
    "df_3 = read_logs(exp_name=None, exp_path=r_schools_path)\n",
    "df_3 = df_3.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"r2score\"), pl.lit(0.0).alias(\"rmse\")\n",
    ")\n",
    "\n",
    "df_raw = pl.concat([df_1, df_2, df_3])\n",
    "df_raw.filter(pl.col(\"base_table\").str.contains(\"depleted\")).write_parquet(\n",
    "    Path(dest_path, \"open_data_us-first.parquet\")\n",
    ")\n",
    "\n",
    "# df_raw.write_parquet(Path(dest_path, \"open_data_us-first-with_full_tables.parquet\"))\n",
    "\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wordnet 50k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0430-e6fv2b39\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)\n",
    "# df_raw.write_parquet(Path(dest_path, \"wordnet_50k-first.parquet\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# New tables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Movies large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0451-gm4f305a\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## US Accidents large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0453-sjkk0koq\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Both only minhash hybrid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0454-s9y3tkn5\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Both wordnet 50k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0471-de9mv1sy\"\n",
    "# r_path = \"results/logs/0455-4dzbjvu2\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Both Open Data US"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0469-p8d11fut\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)\n",
    "\n",
    "r_path = \"results/logs/0470-ngbqan0d\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Missing Starmie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0484-g03mb39x\"\n",
    "# r_path = \"results/logs/0455-4dzbjvu2\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0486-hi5dzppz\"\n",
    "# r_path = \"results/logs/0455-4dzbjvu2\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Missing VLDB50k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0487-awgal4zr\"\n",
    "# r_path = \"results/logs/0455-4dzbjvu2\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extra runs\n",
    "The following runs are not added to the \"overall\" logs because they were not run with all methods."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wordnet 3K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_path = \"results/logs/0424-xc16vll9\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "# overall_list.append(df_raw)\n",
    "df_raw.write_parquet(Path(dest_path, \"wordnet_3k-first.parquet\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine all runs (aggr=first)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])\n"
     ]
    }
   ],
   "source": [
    "for df in overall_list:\n",
    "    print(df.schema)\n",
    "df_overall = pl.concat(overall_list)\n",
    "df_overall.write_parquet(Path(dest_path, \"overall_first.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "###### Method: exact_matching\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n",
      "###### Method: minhash\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n",
      "###### Method: minhash_hybrid\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n",
      "###### Method: starmie\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n"
     ]
    }
   ],
   "source": [
    "target_tables = [\n",
    "    \"company_employees\",\n",
    "    \"housing_prices\",\n",
    "    \"movies_large\",\n",
    "    \"us_accidents_2021\",\n",
    "    \"us_accidents_large\",\n",
    "    \"us_county_population\",\n",
    "    \"us_elections\",\n",
    "    \"schools\"\n",
    "]\n",
    "for jd in [\"exact_matching\", \"minhash\", \"minhash_hybrid\", \"starmie\"]:\n",
    "    print(f\"###### Method: {jd}\")\n",
    "    g = (\n",
    "        df_overall.filter(pl.col(\"jd_method\") == jd)\n",
    "        .select(\"target_dl\", \"base_table\")\n",
    "        .sort(\"target_dl\")\n",
    "        .group_by([\"target_dl\"], maintain_order=True)\n",
    "    )\n",
    "    for gidx, group in g:\n",
    "        print(f\"Data lake: {gidx[0]}\")\n",
    "        tb = group.unique()[\"base_table\"].str.split(\"-\").list.first().to_list()\n",
    "        for m in [_ for _ in target_tables if _ not in tb]:\n",
    "            print(\"--->\", m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "dest_path = Path(\"results/overall\")\n",
    "overall_list = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregation Mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "runs_to_evaluate = [\n",
    "    \"0459\",\n",
    "    \"0494\",\n",
    "    \"0495\",\n",
    "    \"0496\",\n",
    "    \"0497\",\n",
    "]\n",
    "\n",
    "df_list = []\n",
    "\n",
    "for r_path in Path(\"results/logs\").iterdir():\n",
    "    s = r_path.stem\n",
    "    run_id, _ = s.split(\"-\")\n",
    "    if run_id in runs_to_evaluate:\n",
    "        df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "        df_raw = df_raw.fill_null(0).with_columns(\n",
    "            pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    "        )\n",
    "        df_list.append(df_raw)\n",
    "df_concat = pl.concat(df_list)\n",
    "overall_list.append(df_concat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only wordnet 10k\n",
    "r_path = \"results/logs/0438-yboxtpbz\"\n",
    "df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "df_raw = df_raw.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    ")\n",
    "# overall_list.append(df_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregation DFS\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "runs_to_evaluate = [\n",
    "    \"0457\",\n",
    "    \"0467\",\n",
    "    \"0468\",\n",
    "    \"0476\",\n",
    "    \"0477\",\n",
    "    \"0478\",\n",
    "    \"0481\",\n",
    "    \"0482\",\n",
    "    \"0483\",\n",
    "    \"0485\",\n",
    "]\n",
    "\n",
    "df_list = []\n",
    "\n",
    "for r_path in Path(\"results/logs\").iterdir():\n",
    "    s = r_path.stem\n",
    "    run_id, _ = s.split(\"-\")\n",
    "    if run_id in runs_to_evaluate:\n",
    "        df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "        df_raw = df_raw.fill_null(0).with_columns(\n",
    "            pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    "        )\n",
    "        df_list.append(df_raw)\n",
    "\n",
    "r_schools_path = \"results/logs/0489-ceg17qz5\"\n",
    "df_ = read_logs(exp_name=None, exp_path=r_schools_path)\n",
    "df_ = df_.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"r2score\"), pl.lit(0.0).alias(\"rmse\")\n",
    ")\n",
    "df_list.append(df_)\n",
    "\n",
    "df_concat = pl.concat(df_list)\n",
    "\n",
    "overall_list.append(df_concat)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregation first"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "runs_to_evaluate = [\n",
    "    \"0501\",\n",
    "    \"0500\",\n",
    "    \"0502\"\n",
    "]\n",
    "\n",
    "df_list = []\n",
    "\n",
    "for r_path in Path(\"results/logs\").iterdir():\n",
    "    s = r_path.stem\n",
    "    run_id, _ = s.split(\"-\")\n",
    "    if run_id in runs_to_evaluate:\n",
    "        df_raw = read_logs(exp_name=None, exp_path=r_path)\n",
    "        df_raw = df_raw.fill_null(0).with_columns(\n",
    "            pl.lit(0.0).alias(\"auc\"), pl.lit(0.0).alias(\"f1score\")\n",
    "        )\n",
    "        df_list.append(df_raw)\n",
    "df_concat = pl.concat(df_list)\n",
    "overall_list.append(df_concat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "r_schools_path = \"results/logs/0503-oqb8lwhc\"\n",
    "df_ = read_logs(exp_name=None, exp_path=r_schools_path)\n",
    "df_ = df_.fill_null(0).with_columns(\n",
    "    pl.lit(0.0).alias(\"r2score\"), pl.lit(0.0).alias(\"rmse\")\n",
    ")\n",
    "overall_list.append(df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', String), ('epsilon', String)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', String), ('epsilon', String)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', String), ('epsilon', String)])\n",
      "OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', String), ('epsilon', String)])\n"
     ]
    }
   ],
   "source": [
    "for df in overall_list:\n",
    "    print(df.schema)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_overall = pl.concat(overall_list)\n",
    "df_overall.write_parquet(\"results/overall/overall_aggr.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "####### Method: exact_matching\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n",
      "####### Method: minhash\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n",
      "####### Method: minhash_hybrid\n",
      "Data lake: binary_update\n",
      "---> schools\n",
      "Data lake: open_data_us\n",
      "---> us_county_population\n",
      "Data lake: wordnet_full\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_10\n",
      "---> schools\n",
      "Data lake: wordnet_vldb_50\n",
      "---> schools\n"
     ]
    }
   ],
   "source": [
    "target_tables = [\n",
    "    \"company_employees\",\n",
    "    # \"housing_prices\",\n",
    "    \"movies_large\",\n",
    "    \"us_accidents_2021\",\n",
    "    # \"us_accidents_large\",\n",
    "    \"us_county_population\",\n",
    "    # \"us_elections\",\n",
    "    \"schools\"\n",
    "]\n",
    "\n",
    "target_data_lakes = [\n",
    "    \"wordnet_full\",\n",
    "    \"wordnet_vldb_10\",\n",
    "    \"wordnet_vldb_50\",\n",
    "    \"binary_update\",\n",
    "    \"open_data_us\",\n",
    "]\n",
    "for jd in [\"exact_matching\", \"minhash\", \"minhash_hybrid\"]:\n",
    "    print(f\"####### Method: {jd}\")\n",
    "    g = (\n",
    "        df_overall.filter((pl.col(\"jd_method\") == jd) & (pl.col(\"aggregation\") == \"first\"))\n",
    "        .select(\"target_dl\", \"base_table\")\n",
    "        .sort(\"target_dl\")\n",
    "        .group_by([\"target_dl\"], maintain_order=True)\n",
    "    )\n",
    "    found = []\n",
    "    for gidx, group in g:\n",
    "        print(f\"Data lake: {gidx[0]}\")\n",
    "        found.append(gidx[0])\n",
    "        tb = group.unique()[\"base_table\"].str.split(\"-\").list.first().to_list()\n",
    "        for m in [_ for _ in target_tables if _ not in tb]:\n",
    "            print(\"--->\", m)\n",
    "    for _ in target_data_lakes:\n",
    "        if _ not in found:\n",
    "            print(\"Missing data lake:\", _)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "exact_matching\n",
      "shape: (5,)\n",
      "Series: 'target_dl' [str]\n",
      "[\n",
      "\t\"wordnet_vldb_10\"\n",
      "\t\"open_data_us\"\n",
      "\t\"wordnet_vldb_50\"\n",
      "\t\"wordnet_full\"\n",
      "\t\"binary_update\"\n",
      "]\n",
      "minhash\n",
      "shape: (5,)\n",
      "Series: 'target_dl' [str]\n",
      "[\n",
      "\t\"wordnet_full\"\n",
      "\t\"wordnet_vldb_50\"\n",
      "\t\"wordnet_vldb_10\"\n",
      "\t\"binary_update\"\n",
      "\t\"open_data_us\"\n",
      "]\n",
      "minhash_hybrid\n",
      "shape: (5,)\n",
      "Series: 'target_dl' [str]\n",
      "[\n",
      "\t\"wordnet_vldb_50\"\n",
      "\t\"wordnet_full\"\n",
      "\t\"wordnet_vldb_10\"\n",
      "\t\"open_data_us\"\n",
      "\t\"binary_update\"\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "for jd in [\"exact_matching\", \"minhash\", \"minhash_hybrid\"]:\n",
    "    print(jd)\n",
    "    print(df_overall.filter((pl.col(\"jd_method\") == jd) & (pl.col(\"aggregation\") == \"dfs\"))[\"target_dl\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
